head(cars)str(cars)summary(cars)df_cars <- cars
<- と _ と %>% と `
back tick の確認View(cars) または、右上の Environment
から、df_cars をクリック?cars または Help 検索窓で cars,
head などおすすめ:Sys.setenv(LANG = "en")
R packages are extensions to the R statistical programming language containing code, data, and documentation in a standardised collection format that can be installed by users of R using Tool > Install Packages in the top menu bar of R Studio.
Rパッケージは、Rの拡張機能で、コード、データ、ドキュメントを標準化されたコレクション形式で含んでおり、標準的なものは、R Studio の Top Bar の Tool > Install Packages からインストールできます。
tidyverse, rmarkdown,
WDIあとから使うので、ロードしておきます。最初に次のようなコードを実行します。右の三角を押します。
library(tidyverse)
library(WDI)R Markdownはデータサイエンスのためのオーサリングフレームワーク。
コード(プログラム)とその実行結果、を記録・表示し、高品質のレポートの作成を可能にします。
R Notebook は、独立してインタラクティブに実行できるチャンクを持つR Markdownドキュメントの一つの形式で、入力のすぐ下に出力を表示することができます。
WDI(country = "all", indicator = c(gdp = "NY.GDP.MKTP.CD"),
extra=TRUE, cache=wdi_cache) %>% drop_na(gdp) %>%
filter(year==max(year), income !="Aggregates") %>%
drop_na(region) %>% arrange(desc(gdp))chosen_countries <- c("United States","China", "Japan", "Germany", "United Kingdom","India")
WDI(country = c("CN","GB","JP","IN","US","DE"), indicator = c(gdp = "NY.GDP.MKTP.CD"), extra=TRUE) %>% drop_na(gdp) %>%
ggplot(aes(year, gdp, col = country)) + geom_line() +
labs(title = "WDI NY.GDP.MKTP.CD: gdp")WDI(country = c("CN","IN","JP","US"),
indicator = c(gdp_growth_rate = "NY.GDP.MKTP.KD.ZG"), extra=TRUE) %>%
drop_na(gdp_growth_rate) %>%
ggplot(aes(year, gdp_growth_rate, col = country)) + geom_line() +
labs(title = paste("WDI NY.GDP.MKTP.KD.ZG: gdp growth rate"))The World Development Indicators is a compilation of relevant, high-quality, and internationally comparable statistics about global development and the fight against poverty. The database contains 1,400 time series indicators for 217 economies and more than 40 country groups, with data for many indicators going back more than 50 years.
WDIは、世界の開発状況と、貧困との戦いに関する、適切で上質、かつ、国際的に比較可能な時系列の統計データを編纂したものです。このデータベースは、217の経済と40以上の国グループについて1,400の時系列指標を含み、指標のデータの多くは50年以上前に遡ることができます。
いくつか、リストしてみましょう。
WDI
パッケージで、データをダウンロードしたり、探したり、詳細情報を得たりできます。
WDIsearch(string = "gdp", field = "name", short = TRUE, cache = NULL)WDIsearch(string = "NY.GDP.MKTP.CD", field = "indicator", short = TRUE, cache = NULL)名前で検索(“” の間に、(なるべく簡単な)検索文字列を入れてください。)
WDIsearch(string = "", field = "name", short = TRUE, cache = NULL)Indicator で検索(“” の間に、調べたい indicator を入れてください。)
WDIsearch(string = "", field = "indicator", short = TRUE, cache = NULL)short = FALSE
とします。時間がかかるので、検索は、Indicator
と、名前などの情報をもったファイルを手元に持っておくことにします。
wdi_cache <- WDIcache()右上の窓枠(pane)から、wdi_cache
を探して、中身を見てみましょう。series と、country
の二つのデータ・フレームからなっているリストです。三角印や、右から二番目の巻物のようなアイコンをクリックすると中身が見えます。
WDIsearch(string = "CPI Price", field = "name", short = FALSE, cache = wdi_cache)WDIsearch(string = "NY.GDP.MKTP.KD.ZG", field = "indicator", short = FALSE, cache = wdi_cache)string と、field
を、ふたつとも入れてください。
WDIsearch(string = "", field = "", short = FALSE, cache = wdi_cache)Indicator が決まったら、ダウンロードします。
?WDIdf_gdp1 <- WDI(country = "all", indicator = "NY.GDP.MKTP.CD")
df_gdp1df_gdp2 <- WDI(country = "all", indicator = c(gdp = "NY.GDP.MKTP.CD"))
df_gdp2df_gdp3 <- WDI(country = "all", indicator = c(gdp = "NY.GDP.MKTP.CD"), extra=TRUE, cache=wdi_cache)
df_gdp3df_gdp4 <- WDI(country = c("CN","GB","JP","IN","US","DE"), indicator = c(gdp = "NY.GDP.MKTP.CD"), extra=TRUE, cache=wdi_cache)
df_gdp4df_gdp21 <- WDI(country = "all",
indicator = c(gdp_deflator = "NY.GDP.DEFL.KD.ZG",
cpi_price = "CPTOTNSXN"),
extra=TRUE, cache=wdi_cache)
df_gdp21str(df_gdp21)'data.frame': 23972 obs. of 14 variables:
$ country : chr "Advanced Economies" "Advanced Economies" "Advanced Economies" "Advanced Economies" ...
$ iso2c : chr "AME" "AME" "AME" "AME" ...
$ iso3c : chr "" "" "" "" ...
$ year : int 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 ...
$ status : chr "" "" "" "" ...
$ lastupdated : chr "2020-07-27" "2020-07-27" "2020-07-27" "2020-07-27" ...
$ gdp_deflator: num NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "label")= chr "Inflation, GDP deflator (annual %)"
$ cpi_price : num 58.7 60.5 63 66 69.1 ...
..- attr(*, "label")= chr "CPI Price,not seas.adj,,,"
$ region : chr NA NA NA NA ...
$ capital : chr NA NA NA NA ...
$ longitude : chr NA NA NA NA ...
$ latitude : chr NA NA NA NA ...
$ income : chr NA NA NA NA ...
$ lending : chr NA NA NA NA ...
summary(df_gdp21) country iso2c iso3c year
Length:23972 Length:23972 Length:23972 Min. :1960
Class :character Class :character Class :character 1st Qu.:1982
Mode :character Mode :character Mode :character Median :1996
Mean :1995
3rd Qu.:2009
Max. :2021
status lastupdated gdp_deflator cpi_price
Length:23972 Length:23972 Min. : -98.704 Min. : 0.00
Class :character Class :character 1st Qu.: 2.317 1st Qu.: 55.95
Mode :character Mode :character Median : 5.273 Median : 83.28
Mean : 25.308 Mean : 84.18
3rd Qu.: 10.411 3rd Qu.:108.75
Max. :26765.858 Max. :551.25
NA's :11616 NA's :18410
region capital longitude latitude
Length:23972 Length:23972 Length:23972 Length:23972
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
income lending
Length:23972 Length:23972
Class :character Class :character
Mode :character Mode :character
右上の窓枠の、Environment も見てみましょう。
グラフ(Chart)を描いて視覚化しよう
# First attempt: plot gdp against year for each country in df_gdp4 as-is.
df_gdp4 %>%
  ggplot(aes(year, gdp, col = country)) +
  geom_line()

# Refined version: drop the rows whose gdp is NA before plotting and add a title.
df_gdp4 %>%
  drop_na(gdp) %>%
  ggplot(aes(year, gdp, col = country)) +
  geom_line() +
  # paste0() joins the pieces as written; paste() would insert an extra
  # separator space after the colon.
  labs(title = paste0("WDI - NY.GDP.MKTP.CD: ", "gdp"))
Line Plot with one indicator with abbreviation and one country
# Template: one indicator (with a short name) for one country.
chosen_indicator <- "SL.UEM.TOTL.NE.ZS"
short_name <- "unemployment"
chosen_country <- "United States"

# c(short_name = ...) names the element literally "short_name", so the
# downloaded value column is called `short_name`. aes() refers to that column,
# whereas labs() uses the string stored in the variable `short_name`.
WDI(
  country = "all",
  indicator = c(short_name = chosen_indicator),
  extra = TRUE,
  cache = wdi_cache
) %>%
  filter(country == chosen_country) %>%
  ggplot(aes(year, short_name)) +
  geom_line() +
  labs(
    # paste0() keeps the spacing exactly as written in the fragments;
    # paste() would add a second space between every piece.
    title = paste0("WDI ", chosen_indicator, ": ", short_name, " - ", chosen_country),
    y = short_name
  )
Line Plot with one indicator and one country
# Template: one indicator (no short name) for one country.
chosen_indicator <- "SL.UEM.TOTL.NE.ZS"
chosen_country <- "United States"

# The value column is named literally `chosen_indicator` (the name given in
# c()); aes() uses that column, labs() uses the indicator code stored in the
# variable of the same name.
WDI(
  country = "all",
  indicator = c(chosen_indicator = chosen_indicator),
  extra = TRUE,
  cache = wdi_cache
) %>%
  filter(country == chosen_country) %>%
  ggplot(aes(year, chosen_indicator)) +
  geom_line() +
  labs(
    # paste0() avoids the doubled spaces that paste()'s default separator adds.
    title = paste0("WDI ", chosen_indicator, " - ", chosen_country),
    y = chosen_indicator
  )
Line Plot with one indicator with abbreviation and several countries
# Template: one indicator (with a short name) for several countries.
chosen_indicator <- "SL.UEM.TOTL.NE.ZS"
short_name <- "unemployment"
chosen_countries <- c("United States", "United Kingdom", "Japan")

# The value column is named literally `short_name`; rows without a value are
# dropped before the countries are selected and drawn as one line each.
WDI(
  country = "all",
  indicator = c(short_name = chosen_indicator),
  extra = TRUE,
  cache = wdi_cache
) %>%
  drop_na(short_name) %>%
  filter(country %in% chosen_countries) %>%
  ggplot(aes(year, short_name, col = country)) +
  geom_line() +
  labs(
    # paste0() avoids the doubled spaces that paste()'s default separator adds.
    title = paste0("WDI ", chosen_indicator, ": ", short_name),
    y = short_name
  )
Line Plot with two indicators with abbreviation and one country
# Template: two indicators (with short names) for one country.
# Example 1: GDP deflator vs. CPI.
chosen_indicator_1 <- "NY.GDP.DEFL.KD.ZG"
short_name_1 <- "gdp_deflator"
chosen_indicator_2 <- "CPTOTSAXNZGY"
short_name_2 <- "cpi_price"
chosen_country <- "United States"

# The two value columns are named literally `short_name_1` / `short_name_2`.
# pivot_longer() stacks them into `class`/`value`, and scale_color_manual()
# replaces the legend labels with the strings stored in the variables.
WDI(
  country = "all",
  indicator = c(short_name_1 = chosen_indicator_1, short_name_2 = chosen_indicator_2),
  extra = TRUE,
  cache = wdi_cache
) %>%
  filter(country == chosen_country) %>%
  pivot_longer(c(short_name_1, short_name_2), names_to = "class", values_to = "value") %>%
  drop_na(value) %>%
  ggplot(aes(year, value, col = class)) +
  geom_line() +
  # paste0() avoids the doubled spaces that paste()'s default separator adds.
  labs(title = paste0(
    "WDI ", chosen_indicator_1, ": ", short_name_1, "\n",
    chosen_indicator_2, ": ", short_name_2, " - ", chosen_country
  )) +
  scale_color_manual(labels = c(short_name_1, short_name_2), values = scales::hue_pal()(2))

# Example 2: the same template with the two SL.TLF.CACT.* indicators
# labelled "male" and "female".
chosen_indicator_1 <- "SL.TLF.CACT.MA.NE.ZS"
short_name_1 <- "male"
chosen_indicator_2 <- "SL.TLF.CACT.FE.NE.ZS"
short_name_2 <- "female"
chosen_country <- "United States"

WDI(
  country = "all",
  indicator = c(short_name_1 = chosen_indicator_1, short_name_2 = chosen_indicator_2),
  extra = TRUE,
  cache = wdi_cache
) %>%
  filter(country == chosen_country) %>%
  pivot_longer(c(short_name_1, short_name_2), names_to = "class", values_to = "value") %>%
  drop_na(value) %>%
  ggplot(aes(year, value, col = class)) +
  geom_line() +
  labs(title = paste0(
    "WDI ", chosen_indicator_1, ": ", short_name_1, "\n",
    chosen_indicator_2, ": ", short_name_2, " - ", chosen_country
  )) +
  scale_color_manual(labels = c(short_name_1, short_name_2), values = scales::hue_pal()(2))
Line Plot with two indicators with abbreviation and several countries
# Template: two indicators (with short names) for several countries.
# Colour distinguishes the country, line type distinguishes the indicator.
# Example 1: GDP deflator vs. CPI.
chosen_indicator_1 <- "NY.GDP.DEFL.KD.ZG"
short_name_1 <- "gdp_deflator"
chosen_indicator_2 <- "CPTOTSAXNZGY"
short_name_2 <- "cpi_price"
chosen_countries <- c("United States", "France", "Japan")

# The two value columns are named literally `short_name_1` / `short_name_2`;
# scale_linetype_manual() replaces those legend labels with the strings stored
# in the variables.
WDI(
  country = "all",
  indicator = c(short_name_1 = chosen_indicator_1, short_name_2 = chosen_indicator_2),
  extra = TRUE,
  cache = wdi_cache
) %>%
  filter(country %in% chosen_countries) %>%
  pivot_longer(c(short_name_1, short_name_2), names_to = "class", values_to = "value") %>%
  drop_na(value) %>%
  ggplot(aes(year, value, linetype = class, col = country)) +
  geom_line() +
  # paste0() avoids the doubled spaces that paste()'s default separator adds.
  labs(title = paste0(
    "WDI ", chosen_indicator_1, ": ", short_name_1, "\n",
    chosen_indicator_2, ": ", short_name_2
  )) +
  scale_linetype_manual(labels = c(short_name_1, short_name_2), values = c("solid", "dashed"))

# Example 2: the same template with the two SL.TLF.CACT.* indicators
# labelled "male" and "female".
chosen_indicator_1 <- "SL.TLF.CACT.MA.NE.ZS"
short_name_1 <- "male"
chosen_indicator_2 <- "SL.TLF.CACT.FE.NE.ZS"
short_name_2 <- "female"
chosen_countries <- c("United States", "France", "Japan")

WDI(
  country = "all",
  indicator = c(short_name_1 = chosen_indicator_1, short_name_2 = chosen_indicator_2),
  extra = TRUE,
  cache = wdi_cache
) %>%
  filter(country %in% chosen_countries) %>%
  pivot_longer(c(short_name_1, short_name_2), names_to = "class", values_to = "value") %>%
  drop_na(value) %>%
  ggplot(aes(year, value, linetype = class, col = country)) +
  geom_line() +
  labs(title = paste0(
    "WDI ", chosen_indicator_1, ": ", short_name_1, "\n",
    chosen_indicator_2, ": ", short_name_2
  )) +
  scale_linetype_manual(labels = c(short_name_1, short_name_2), values = c("solid", "dashed"))
上のテンプレートをコピーして、下に貼り付け、指標
indicator と、略称 short_name
と、いくつかの国名 chosen_countries
を、入れ替えて、試してみてください。
EDAは、データが何を語っているかを理解するための反復的なサイクルです。
まず、データに関する問いを作成します。
データの可視化、変換、モデリングを行い、答えを探します。
学習したことを活用して、問いを修正したり、新しい問いを考えたりします。そして、このサイクルを繰り返していきます。
EDAはデータ分析において重要な役割を果たします。また、データが基準を満たしているかどうかを確認し、データの品質を保証するために使用することもできます。
R4DS からのイメージ
スタートは、本来は、データの作成・探索ですが、分析したいデータはすでにあるとして話を進めます。まずは、data
フォルダ(directory)を作成しておくと良い。右下の窓枠の Files
タブから、New Folder で作成してもよい。
dir.create("./data")データの取得・読み込みを、四つの方法に分けて説明します。
write_csv(df_name, "./data/name.csv")
df_name <- read_csv("./data/file_name.csv")
df_name <- read_csv(url_of_a_csv)
library(readxl)。df_name <- read_excel("./data/file_name.xlsx")
df_name <- read_delim(clipboard())
WDIcache() の扱い
二つの、ファイルが一つになった、リストであるため、違った命令を使います。
wdi_cache <- WDIcache()
write_rds(wdi_cache, "./wdi_cache.RData")wdi_cache <- read_rds("./wdi_cache.RData")url_un_pop <- "https://data.un.org/_Docs/SYB/CSV/SYB65_1_202209_Population,%20Surface%20Area%20and%20Density.csv"
df_un_pop0 <- read_csv(url_un_pop)New names:Rows: 7874 Columns: 7── Column specification ──────────────────────────────────────────────────────────
Delimiter: ","
chr (7): T02, Population, density and surface area, ...3, ...4, ...5, ...6, ...7
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_pop0url_un_pop <- "https://data.un.org/_Docs/SYB/CSV/SYB65_1_202209_Population,%20Surface%20Area%20and%20Density.csv"
df_un_pop <- read_csv(url_un_pop, skip=1)New names:Rows: 7873 Columns: 7── Column specification ──────────────────────────────────────────────────────────
Delimiter: ","
chr (4): ...2, Series, Footnotes, Source
dbl (2): Region/Country/Area, Year
num (1): Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_un_popdf_un_pop %>% distinct(`Region/Country/Area`, `...2`)df_un_pop %>% filter(`Region/Country/Area` %in% c(2,19,142,150,9), Series == "Population mid-year estimates (millions)") %>%
ggplot(aes(Year, Value, fill = `...2`)) + geom_area(col="black") +
labs(title = "Population mid-year estimates (millions) of the World")GDP per hour worked is a measure of labour productivity. It measures how efficiently labour input is combined with other factors of production and used in the production process. Labour input is defined as total hours worked of all persons engaged in production. Labour productivity only partially reflects the productivity of labour in terms of the personal capacities of workers or the intensity of their effort. The ratio between the output measure and the labour input depends to a large degree on the presence and/or use of other inputs (e.g. capital, intermediate inputs, technical, organisational and efficiency change, economies of scale). This indicator is measured in USD (constant prices 2010 and PPPs) and indices.
労働時間当たりGDPは、労働生産性の指標である。これは、労働投入量が他の生産要素と組み合わされ、生産プロセスでどれだけ効率的に利用されたかを測定するものである。労働投入量は、生産に従事するすべての人の総労働時間として定義される。労働生産性は、労働者の個人的能力や努力の強さといった労働の生産性を部分的にしか反映していない。アウトプット指標と労働投入量の比率は、他の投入物(資本、中間投入物、技術・組織・効率の変化、規模の経済など)の存在や利用に大きく左右される。この指標は、米ドル(2010年不変価格およびPPP)および指数で測定される。
df_oecd_productivity <- read_csv("./data/DP_LIVE_21022023111712065.csv")Rows: 3894 Columns: 8── Column specification ──────────────────────────────────────────────────────────
Delimiter: ","
chr (6): LOCATION, INDICATOR, SUBJECT, MEASURE, FREQUENCY, Flag Codes
dbl (2): TIME, Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_oecd_productivitydf_oecd_productivity$LOCATION %>% unique() [1] "AUS" "AUT" "BEL" "CAN" "CZE" "DNK"
[7] "FIN" "FRA" "DEU" "GRC" "HUN" "ISL"
[13] "IRL" "ITA" "JPN" "KOR" "LUX" "MEX"
[19] "NLD" "NZL" "NOR" "POL" "PRT" "SVK"
[25] "ESP" "SWE" "CHE" "TUR" "GBR" "USA"
[31] "CHL" "EST" "ISR" "RUS" "SVN" "OECD"
[37] "EU28" "G-7" "LVA" "LTU" "EA19" "ZAF"
[43] "CRI" "BGR" "HRV" "ROU" "EU27_2020" "COL"
df_oecd_productivity$INDICATOR %>% unique()[1] "GDPHRWKD"
df_oecd_productivity$SUBJECT %>% unique()[1] "TOT"
df_oecd_productivity$MEASURE %>% unique()[1] "USD" "IDX2015"
df_oecd_productivity$FREQUENCY %>% unique()[1] "A"
# List the distinct values of TIME in the productivity data.
df_oecd_productivity$TIME %>% unique()

# Ranking for 2021: USD values only, largest first.
df_oecd_productivity %>%
  filter(MEASURE == "USD", TIME == 2021) %>%
  select(LOCATION, Value) %>%
  arrange(desc(Value))

# Time series of the USD measure for Japan and three aggregates.
df_oecd_productivity %>%
  filter(LOCATION %in% c("JPN", "OECD", "G-7", "EU28")) %>%
  filter(MEASURE == "USD") %>%
  ggplot(aes(TIME, Value, col = LOCATION)) +
  geom_line() +
  labs(
    title = "GDP per hour worked",
    # The rows plotted are MEASURE == "USD", not the IDX2015 index, so the
    # subtitle names the USD unit rather than "2015=100".
    subtitle = "Total, USD (constant prices 2010 and PPPs)"
  )
Adult education level: https://data.oecd.org/eduatt/adult-education-level.htm
df_oecd_education_level <- read_csv("./data/DP_LIVE_21022023120132654.csv")Rows: 7330 Columns: 8── Column specification ──────────────────────────────────────────────────────────
Delimiter: ","
chr (5): LOCATION, INDICATOR, SUBJECT, MEASURE, FREQUENCY
dbl (2): TIME, Value
lgl (1): Flag Codes
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_oecd_education_levellibrary(readxl)url_summary <- "https://wir2022.wid.world/www-site/uploads/2022/03/WIR2022TablesFigures-Summary.xlsx"
download.file(url = url_summary, destfile = "./data/WIR2022s.xlsx", mode = "wb") trying URL 'https://wir2022.wid.world/www-site/uploads/2022/03/WIR2022TablesFigures-Summary.xlsx'
Content type 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' length 908659 bytes (887 KB)
==================================================
downloaded 887 KB
excel_sheets("./data/WIR2022s.xlsx") [1] "Index" "F1" "F2" "F3" "F4" "F5."
[7] "F6" "F7" "F8" "F9" "F10" "F11"
[13] "F12" "F13" "F14" "F15" "T1" "data-F1"
[19] "data-F2" "data-F3" "data-F4" "data-F5" "data-F6" "data-F7"
[25] "data-F8" "data-F9" "data-F10" "data-F11" "data-F12" "data-F13."
[31] "data-F14." "data-F15"
df1_wir <- read_excel("./data/WIR2022s.xlsx", sheet = "data-F1")New names:
df1_wirdf1_wir %>% select(cat = ...1, 2:4) %>%
pivot_longer(2:4, names_to = "group", values_to = "value") %>%
ggplot(aes(x = cat, y = value, fill = group)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
geom_text(aes(x = cat, y = value, group = group, label = scales::label_percent(accuracy=1)(value)), vjust = -0.08,
position = position_dodge(0.9)) +
labs(title = "Figure 1. Global income and wealth inequality, 2021",
x = "", y = "Share of total income or wealth", fill = "")df2_wir <- read_delim(clipboard())Rows: 8 Columns: 5── Column specification ──────────────────────────────────────────────────────────
Delimiter: "\t"
chr (4): iso, Bottom 50%, Middle 40%, Top 10%
dbl (1): year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df2_wirdf2_wir2 <- read_excel("./data/WIR2022s.xlsx", sheet = "data-F2")
df2_wir2df2_wir2 %>% pivot_longer(3:5, names_to = "level", values_to = "value") %>%
ggplot(aes(x = iso, y = value, fill = level)) +
geom_col(position = "dodge") +
scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 8)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(title = "Figure 2. The poorest half lags behind Bottom 50%, middle 40% \nand top 10% income shares across the world in 2021",
x = "", y = "Share of national income (%)", fill = "")4.1. Look at the data: suppose df is the data frame
dt <- as_tibble(df)head(df), str(df),
summary(df), dt, glimpse(dt)4.2. Look at each variable
4.3. Variation of each data: suppose x1 is a column
name.
df %>% ggplot() + geom_histogram(aes(x1), bins = 30)
df %>% drop_na(x1): see the rows with a value in
x1. If the value is NA, the row is not shown.
df_wo_na <- df %>% drop_na(x1) if you want to use
only the rows without NA in x1
4.4. Use dplyr and tidyr to change column
names, tidy data, and/or summarize data
rename, select, filter,
arrange, mutate, pivot_longer(),
pivot_wider(), group_by and
summarize
5.1. In combination with Step 4 - data transformation, try various data visualizations.
5.2. Keep a record of what you can observe by the visualization
5.3. Edit the list of questions by adding or polishing
5.4. Select several informative charts and add options
5.5. Look at examples from the textbooks or teaching site to have better visualization
ggplot2EDA is an iterative cycle that helps you understand what your data says. When you do EDA, you:
Generate questions about your data
Search for answers by visualising, transforming, and/or modeling your data
Use what you learn to refine your questions and/or generate new questions
EDA is an important part of any data analysis. You can use EDA to make discoveries about the world; or you can use EDA to ensure the quality of your data, asking questions about whether the data meets your standards or not.
dplyr, tidyrggplot2geom_smooth()学校種類別進学率の推移: https://empowerment.tsuda.ac.jp/detail/82584
# CSV with the advancement-rate table (Japanese government white paper).
url_school_jp <- "https://www.gender.go.jp/about_danjo/whitepaper/r02/zentai/html/honpen/csv/zuhyo01-04-01.csv"

# Check the likely text encoding before reading the file.
guess_encoding(url_school_jp, n_max = 10000, threshold = 0.2)

# Read the file as Shift_JIS, skipping its first two lines.
df_school_jp <- read_csv(url_school_jp, locale = locale(encoding = "Shift_JIS"), skip = 2)
df_school_jp

# Work on a copy and replace the header with ASCII column names.
df_edu0 <- df_school_jp
colnames(df_edu0) <- c(
  "year", "highschool_m", "highschool_f", "vocational_m", "vocational_f",
  "university_m", "university_f", "juniorcollege_f", "gradschool_m", "gradschool_f"
)

df_edu00 <- df_edu0 %>%
  mutate(
    # NOTE(review): assumes the table has exactly one row per year for
    # 1950-2019 (70 rows); mutate() fails if the row count differs.
    year = 1950:2019,
    # Each combined series is the simple mean of its `_m` and `_f` columns.
    highschool = (highschool_m + highschool_f) / 2,
    vocational = (vocational_m + vocational_f) / 2,
    university = (university_m + university_f) / 2,
    # Only an `_f` column exists for junior college, so it is used as-is.
    juniorcollege = juniorcollege_f,
    gradschool = (gradschool_m + gradschool_f) / 2
  )

df_edu00 %>%
  filter(year >= 1954) %>%
  # Drop the original columns 2-10, keeping year and the derived columns.
  select(-(2:10)) %>%
  # Columns 3-5 are now vocational, university and juniorcollege.
  pivot_longer(3:5, names_to = "schools", values_to = "percentage") %>%
  mutate(types = factor(schools, levels = c("vocational", "juniorcollege", "university"))) %>%
  pivot_longer(c(highschool, gradschool), names_to = "highgrad", values_to = "value") %>%
  mutate(high_grad = factor(highgrad, levels = c("highschool", "gradschool"))) %>%
  ggplot() +
  # Stacked areas for the three school types, lines for high school and
  # graduate school.
  geom_area(aes(x = year, y = percentage, fill = types)) +
  geom_line(aes(x = year, y = value, linetype = high_grad)) +
  scale_x_continuous(breaks = seq(1960, 2020, by = 10)) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  labs(
    title = "Tertiary Education After Highschool",
    subtitle = "with Highschool Graduates and Graduate School",
    fill = "",
    linetype = ""
  )
filter(), select(),
arrange(), mutate(), group_by(),
summarize()There is no rule about which questions you should ask to guide your research. However, two types of questions will always be useful for making discoveries within your data. You can loosely word these questions as:
下のリンクを開き、右上の Code ボタンから、Download Rmd を選択すると、ダウンロードできますから、ダウンロードしたものを、プロジェクト・フォルダーに移動またはコピーしてください。ダウンロードできないときは、Ctrl を押しながら、Download Rmd をクリックすると、Save As で保存できると思います。ブラウザーによって仕様が異なりますから、適切な方法を選んでください。
Windows でも、Mac でも提供されている、Google Chrome の場合には、Code ボタンから、ダウンロードされるはずです。
RStudio Cloudは、誰でもオンラインでデータサイエンスを行い、共有し、教え、学ぶことができる、軽量でクラウドベースのソリューションです。
Posit Primers https://posit.cloud/learn/primers
R For Data Science, by H. Wickham: https://r4ds.had.co.nz
Bookdown: https://bookdown.org, Archive
R Markdownはデータサイエンスのためのオーサリングフレームワーク。
コード(プログラム)とその実行結果、を記録・表示し、高品質のレポートの作成を可能にします。
R Notebook は、独立してインタラクティブに実行できるチャンクを持つR Markdownドキュメントの一つの形式で、入力のすぐ下に出力を表示することができます。
下のリンクを開き、右上の Code ボタンから、Download Rmd を選択すると、ダウンロードできますから、ダウンロードしたものを、プロジェクト・フォルダーに移動またはコピーしてください。ダウンロードできないときは、Ctrl を押しながら、Download Rmd をクリックすると、Save As で保存できると思います。ブラウザーによって仕様が異なりますから、適切な方法を選んでください。
Windows でも、Mac でも提供されている、Google Chrome の場合には、Code ボタンから、ダウンロードされるはずです。